The following markdown provides a basic rundown of quality control of samples (project directory: )
# QC output 1:
load(file.path(sampleDir,"geno_probs/QC_1.RData"))
# cross object
load(file.path(sampleDir,"geno_probs/cross.RData"))
# x chromosome intensities
xint_file <- file.path(sampleDir,"qtl2genos/chrXint.csv")
xint <- read_csv_numer(xint_file, transpose=TRUE)
# y chromosome intensities
yint_file <- file.path(sampleDir,"qtl2genos/chrYint.csv")
yint <- read_csv_numer(yint_file, transpose=TRUE)
There are 266 samples in the experiment, comprising 266 females and 0 males. There were a total of 123882 informative markers.
# label values with mouse ids
labels <- paste0(names(percent_missing), " (", round(percent_missing), "%)")
# count number of samples at different missingness thresholds
missing_thresh <- unlist(lapply(c(1,5,10,20), function(x) length(percent_missing[percent_missing > x])))
# plot
qtlcharts::iplot(x = seq_along(percent_missing),
y = percent_missing,
indID=labels,
chartOpts=list(xlab="Mouse",
ylab="Percent missing genotype data",
ylim=c(0, 100))) %>% suppressMessages()
There were:
Below is a list of sample pairs flagged as sample duplicates (if there were any). A histogram of the proportion of matching genotypes between all sample pairs is also provided.
# default qtl2 report for sample duplicates
summary(cg)
## No pairs with proportion matches above 0.9.
# histogram
hist(cg[upper.tri(cg)], breaks=seq(0, 1, length=201),
main="", yaxt="n", ylab="", xlab="Proportion matching genotypes")
rug(cg[upper.tri(cg)])
# take average X chromosome probe intensity for non-null markers
xint[is.na(xint)] <- mean(xint[!is.na(xint)])
xint_ave <- rowMeans(xint[complete.cases(xint),])
# take average Y chromosome probe intensity for non-null markers
yint[is.na(yint)] <- mean(yint[!is.na(yint)])
yint_ave <- rowMeans(yint[complete.cases(yint),])
ave_int <- data.frame(cbind(xint_ave, yint_ave)) %>%
dplyr::mutate(sample = rownames(.))
ave_int <- cbind(ave_int,cross$covar)
# make cutoffs df
categories_df <- data.frame(c(0.25,0.5,0.25,0.5),
c(0.5,0.5,0.1,0.1),
c("M","XXY","XO","F"))
colnames(categories_df) <- c("X","Y","sex")
# sex palette
pal <- c("green","brown","pink","purple")
names(pal) <- categories_df$sex
# categorize samples
ave_int <- ave_int %>%
dplyr::mutate(predicted_sex = dplyr::case_when(xint_ave > 0.45 & yint_ave < 0.15 ~ "F",
xint_ave < 0.45 & yint_ave < 0.15 ~ "XO",
xint_ave > 0.45 & yint_ave > 0.15 ~ "XXY",
xint_ave < 0.45 & yint_ave > 0.15 ~ "M"),
sample = rownames(.))
ave_int <- cbind(ave_int, percent_missing)
sex_check_plot <- ggplot() +
geom_jitter(ave_int, mapping = aes(x = xint_ave, y = yint_ave,
fill = predicted_sex,
label = sample,
label2 = percent_missing),
size = 3, shape = 21) +
scale_fill_manual(values = pal) +
geom_hline(yintercept = 0.15) +
geom_vline(xintercept = 0.45) +
labs(x = "Mean Chr X Probe Intensity",
y = "Mean Chr Y Probe Intensity")%>%
suppressWarnings()
## Warning: Ignoring unknown aesthetics: label, label2
# interactive sex check plot
plotly::ggplotly(sex_check_plot, tooltip = c("sample","percent_missing"))
## Warning: `gather_()` was deprecated in tidyr 1.2.0.
## ℹ Please use `gather()` instead.
## ℹ The deprecated feature was likely used in the plotly package.
## Please report the issue at <]8;;https://github.com/plotly/plotly.R/issueshttps://github.com/plotly/plotly.R/issues]8;;>.
pHet <- rowSums(cross$geno$X == 2)/rowSums(cross$geno$X != 0)
pHet <- cbind(ave_int, pHet)
pHet_plot <- ggplot() +
geom_jitter(pHet, mapping = aes(x = xint_ave, y = pHet,
fill = predicted_sex,
label = sample,
label2 = percent_missing),
size = 2, shape = 21) +
scale_fill_manual(values = pal) +
geom_hline(yintercept = 0.15) +
geom_vline(xintercept = 0.45) +
labs(x = "Mean Chr X Probe Intensity",
y = "Proportion Heterozygous Chr X Genotypes") %>%
suppressWarnings()
## Warning: Ignoring unknown aesthetics: label, label2
# interactive pHet plot
plotly::ggplotly(pHet_plot, tooltip = c("sample","percent_missing"))